Loading Packages and Data

library(tidyverse)
library(naniar)
library(simputation)
library(ggplot2)
library(plotly)
# library(readr)
# CJARS state-level statistics and the accompanying context variables
state_data <- read_csv(file = "state_data_updated.csv")
context_state_data <- read_csv(file = "context_state_data.csv")



The state_data csv contains “geographic and population break downs, cohort years and sizes, rate statistics and outcome statistics.”

The context_state_data csv contains context variables to support comparison and contextualization of criminal justice caseload and socioeconomic outcome statistics.



Exploring Large Data

# Preview the first six rows of the CJARS state data
head(state_data)
## # A tibble: 6 × 77
##    fips cohort_year   sex  race age_group repeat_contact off_type fe_rate
##   <dbl>       <dbl> <dbl> <dbl>     <dbl>          <dbl>    <dbl>   <dbl>
## 1     4        2000     0     0         0              0        0    1554
## 2     4        2000     0     0         0              1        0      NA
## 3     4        2000     0     0         0              2        0      NA
## 4     4        2000     0     0         0              3        0      NA
## 5     4        2000     0     0         0              0        1     302
## 6     4        2000     0     0         0              1        1      NA
## # ℹ 69 more variables: N_fe_rate <dbl>, fe_any_w2_y1 <dbl>,
## #   N_fe_any_w2_y1 <dbl>, fe_any_w2_y3 <dbl>, N_fe_any_w2_y3 <dbl>,
## #   fe_any_w2_y5 <dbl>, N_fe_any_w2_y5 <dbl>, fe_w2_wages_y1 <dbl>,
## #   N_fe_w2_wages_y1 <dbl>, fe_w2_wages_y3 <dbl>, N_fe_w2_wages_y3 <dbl>,
## #   fe_w2_wages_y5 <dbl>, N_fe_w2_wages_y5 <dbl>, inc_rate <dbl>,
## #   N_inc_rate <dbl>, inc_any_w2_y1 <dbl>, N_inc_any_w2_y1 <dbl>,
## #   inc_any_w2_y3 <dbl>, N_inc_any_w2_y3 <dbl>, inc_any_w2_y5 <dbl>, …

# First and last cohort year present in the data. range() returns two values,
# so reframe() is used: summarize() is meant to return one row per group, and
# returning more than one row from it is deprecated as of dplyr 1.1.0.
state_data %>%
  reframe(range(cohort_year))
## # A tibble: 2 × 1
##   `range(cohort_year)`
##                  <dbl>
## 1                 2000
## 2                 2022

# Preview the first six rows of the context variables
head(context_state_data)
## # A tibble: 6 × 7
##   fips  N_contextdata violent property poverty health unemployment
##   <chr>         <dbl>   <dbl>    <dbl>   <dbl>  <dbl>        <dbl>
## 1 01          4886307     438     3322   0.176   1.79        0.059
## 2 02           733306     655     2871   0.106  -0.33        0.07 
## 3 04          6933221     420     3212   0.162  -0.15        0.059
## 4 05          2989074     499     3579   0.177   1.47        0.055
## 5 06         38734434     418     2606   0.145  -0.76        0.072
## 6 08          5501284     321     2656   0.113  -0.57        0.049


We can see that the data collected ranges from the years 2000 to 2022.


Cleaning and Wrangling data


# fips is read as character (e.g. "01"); converting it to integer drops the
# leading zeros so it can be matched against numeric FIPS codes
context_state_data <- mutate(context_state_data, fips = as.integer(fips))

# Lookup table pairing each of the 50 state names with its FIPS code.
# state.name is already in alphabetical order, which is also FIPS order; the
# codes run from 1 to 56 with 3, 7, 11, 14, 43 and 52 not used for a state.
fips_codes <- data.frame(
  state = state.name,
  fips = as.numeric(setdiff(1:56, c(3, 7, 11, 14, 43, 52)))
)

# Attach state names by FIPS code. inner_join() keeps only the rows whose fips
# value appears in fips_codes.
state_data <- inner_join(state_data, fips_codes, by = "fips")

context_state_data <- inner_join(context_state_data, fips_codes, by = "fips")

# state_data %>%
#   summary()



# Separating variables into different data sets

# Keep only the rows where sex, race, age group, offense type, and repeat
# contact are all 0 (all). The filter is applied once here and shared by the
# three data sets below instead of being repeated for each of them.
overall_state_data <- state_data %>%
  filter(
    sex == 0, race == 0, age_group == 0,
    off_type == 0, repeat_contact == 0
  )

# Felony: rate and one-year W2 variables; rows with any NA are dropped
fe_state_data <- overall_state_data %>%
  select(
    fips, state, cohort_year,
    fe_rate, N_fe_rate,
    fe_any_w2_y1, N_fe_any_w2_y1,
    fe_w2_wages_y1, N_fe_w2_wages_y1
  ) %>%
  na.omit()

# Incarceration: same layout as the felony data set
inc_state_data <- overall_state_data %>%
  select(
    fips, state, cohort_year,
    inc_rate, N_inc_rate,
    inc_any_w2_y1, N_inc_any_w2_y1,
    inc_w2_wages_y1, N_inc_w2_wages_y1
  ) %>%
  na.omit()

# Misdemeanor: same layout as the felony data set
mi_state_data <- overall_state_data %>%
  select(
    fips, state, cohort_year,
    mi_rate, N_mi_rate,
    mi_any_w2_y1, N_mi_any_w2_y1,
    mi_w2_wages_y1, N_mi_w2_wages_y1
  ) %>%
  na.omit()



# Number of rows (observations) remaining in each data set after the
# na.omit() step above
nrow(fe_state_data)
## [1] 180
nrow(inc_state_data)
## [1] 203
nrow(mi_state_data)
## [1] 165




# tidyverse function rename(New_Name = Old_Name)

# Felony: drop the wages sample-size column, then give the remaining
# columns shorter, descriptive names (rename(New_Name = Old_Name))
fe_data <- fe_state_data %>%
  select(-N_fe_w2_wages_y1) %>%
  rename(
    year = cohort_year,
    state_pop = N_fe_rate,
    cohort_size = N_fe_any_w2_y1
  )


# Incarcerated: drop the wages sample-size column, then give the remaining
# columns shorter, descriptive names (rename(New_Name = Old_Name))
inc_data <- inc_state_data %>%
  select(-N_inc_w2_wages_y1) %>%
  rename(
    year = cohort_year,
    state_pop = N_inc_rate,
    cohort_size = N_inc_any_w2_y1
  )



# Misdemeanor: drop the wages sample-size column, then give the remaining
# columns shorter, descriptive names (rename(New_Name = Old_Name))
mi_data <- mi_state_data %>%
  select(-N_mi_w2_wages_y1) %>%
  rename(
    year = cohort_year,
    state_pop = N_mi_rate,
    cohort_size = N_mi_any_w2_y1
  )



Variable descriptions and details

  • fips/state: FIPS code and name of the state each record belongs to (the qualifying event is different for each variable)
  • year: based on charge disposition date, or when an individual was convicted
  • fe_rate: per capita rate of felony charges per 100,000 residents, based on charge filing date
  • state_pop: population of the state
  • fe_any_w2_y1: proportion of individuals with at least one W2 filed one year after a felony charge per cohort per state
  • cohort_size: Group of people with felony charges based on the date the charge was filed
  • fe_w2_wages_y1: average income of individuals one year after a felony charge

Variables for other categories or data sets are similar.



Visualizations


# Felony: one panel per state, W2 filing proportion plotted against year
ggplot(fe_data, aes(x = year, y = fe_any_w2_y1)) +
  geom_point(size = 1) +
  geom_line() +
  facet_wrap(~state) +
  labs(
    x = "Year",
    y = "Proportion of Individuals with W2 Filed",
    title = "Yearly Proportion of Individuals Filing W2 One Year After Felony Charge by State"
  ) +
  theme_minimal()


# Incarcerated: one panel per state, W2 filing proportion plotted against year
ggplot(inc_data, aes(x = year, y = inc_any_w2_y1)) +
  geom_point(size = 1) +
  geom_line() +
  facet_wrap(~state) +
  labs(
    x = "Year",
    y = "Proportion of Individuals with W2 Filed",
    title = "Yearly Proportion of Individuals Filing W2 One Year After Release by State"
  ) +
  theme_minimal()


# Misdemeanor: one panel per state, W2 filing proportion plotted against year
ggplot(mi_data, aes(x = year, y = mi_any_w2_y1)) +
  geom_point(size = 1) +
  geom_line() +
  facet_wrap(~state) +
  labs(
    x = "Year",
    y = "Proportion of Individuals with W2 Filed",
    title = "Yearly Proportion of Individuals Filing W2 One Year After Misdemeanor Charge by State"
  ) +
  theme_minimal()




# Stack the three data sets into one long data frame. Each input is reduced to
# year, state and a common any_w2_y1 column; the argument names passed to
# bind_rows() become the values of the category column.
combined_data <- bind_rows(
  Felony = select(fe_data, year, state, any_w2_y1 = fe_any_w2_y1),
  Incarcerated = select(inc_data, year, state, any_w2_y1 = inc_any_w2_y1),
  Misdemeanor = select(mi_data, year, state, any_w2_y1 = mi_any_w2_y1),
  .id = "category"
) %>%
  select(year, state, category, any_w2_y1)

# Combined plot: one panel per state with one coloured series per category
combined_data %>%
  ggplot(aes(x = year, y = any_w2_y1, color = category)) +
  geom_point(size = 1) +
  geom_line() +
  facet_wrap(~state) +
  labs(
    color = "Category",
    x = "Year",
    y = "Proportion of Individuals with W2 Filed",
    title = "Yearly Proportion of Individuals Filing W2 One Year After Felony, Incarceration, or Misdemeanor by State"
  ) +
  theme_minimal()




Combining CJARS data with other collected data

# Additional collected data: Indeed job postings and employer pledges
indeed <- read_csv(file = "indeed-data.csv")

pledge <- read_csv(file = "pledge-data.csv")

CJARS and Indeed Scraped Data


# State name / postal abbreviation lookup built from R's built-in vectors.
# NOTE: this reuses the name state_data, so from here on it no longer refers
# to the CJARS table loaded at the top of the document.
state_data <- tibble(
  state = datasets::state.name,
  abb = datasets::state.abb
)

# Add the state lookup columns to the Indeed data (joined on shared columns)
indeed <- left_join(indeed, state_data)

# 2017 felony outcomes with the Indeed columns attached; every 2017 felony
# row is kept whether or not it has a match in the Indeed data
indeedCJARS <- fe_data %>%
  filter(year == 2017) %>%
  left_join(indeed)

# W2 filing proportion against the fair chance posting rate, with each state
# drawn as its abbreviation
ggplot(indeedCJARS, aes(x = avgFair, y = fe_any_w2_y1, label = abb)) +
  geom_text() +
  labs(
    title = "Felony Charges",
    x = "Fair Chance Job postings per 1,000 postings",
    y = "Proportion of Individuals with W2 Filed"
  ) +
  # geom_smooth(method = "lm")+
  theme_minimal()


Pledge Data

# Creating an index: for each state, the share of "Yes" among its "Yes" and
# "No" pledge records, rounded to two decimals
pledge_index <- pledge %>%
  group_by(state) %>%
  summarize(
    yes_count = sum(pledge == "Yes"),
    no_count = sum(pledge == "No"),
    index = yes_count / (yes_count + no_count)
  ) %>%
  mutate(
    # a missing index (for cases where there are no yes or no counts) becomes 0
    index = if_else(is.na(index), 0, round(index, 2))
  )

# Add the state lookup columns to the pledge index
pledge_index <- left_join(pledge_index, state_data)

# Pledge index with the Indeed columns attached
indeedPledge <- left_join(pledge_index, indeed)

# Bar chart counting how many rows share each pledge index value,
# rendered as an interactive plotly widget
plot <- ggplot(indeedPledge, aes(x = index)) +
  geom_bar() +
  labs(
    title = "Count of Pledge Index for all 50 States",
    x = "Pledge Index",
    y = "Count"
  ) +
  theme_minimal()

ggplotly(plot)

# indeedPledge %>%
#   ggplot(aes(x = avgFair, y = index, label = abb)) +
#   geom_text() +
#   labs(title = "Pledges", y = "State Pledge Index", x = "Fair Chance Job postings per 1,000 postings") +
#   theme_minimal()
# Number of "Yes" pledge records per employer, most frequent first
employer_counts <- pledge %>%
  filter(pledge == "Yes") %>%
  count(employer, sort = TRUE)

# The nine employers with the most "Yes" pledges; ties at the cutoff are kept.
# slice_max() is used because dplyr has superseded top_n().
top_employers <- employer_counts %>%
  slice_max(order_by = n, n = 9)

# Bar chart of "Yes" pledge counts per employer, ordered by count and
# rendered as an interactive plotly widget
plot2 <- ggplot(top_employers, aes(x = reorder(employer, n), y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flips the coordinates for better readability
  labs(
    title = "Top 9 Employers with 'Yes' Pledges",
    x = "Employer",
    y = "Number of 'Yes' Pledges"
  ) +
  theme_minimal()

ggplotly(plot2)

Only 9 unique employers in the data have signed the Fair Chance Pledge, and 21 of the "Yes" pledge instances come from Walmart.

Linear Regression Models

# Simple linear regression of the one-year W2 filing proportion on the
# fair chance posting rate (2017 felony data)
model <- lm(formula = fe_any_w2_y1 ~ avgFair, data = indeedCJARS)

summary(model)
## 
## Call:
## lm(formula = fe_any_w2_y1 ~ avgFair, data = indeedCJARS)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.055748 -0.032179 -0.022206 -0.001103  0.115873 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.562533   0.072864   7.720 9.15e-06 ***
## avgFair     -0.004087   0.004547  -0.899    0.388    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05442 on 11 degrees of freedom
## Multiple R-squared:  0.06842,    Adjusted R-squared:  -0.01627 
## F-statistic: 0.8079 on 1 and 11 DF,  p-value: 0.388



Interpreting the intercept doesn’t make much sense. It is saying that when there are zero fair chance postings per 1000 Indeed job postings, the proportion of individuals with a filed W2 one year after a felony conviction is approximately 0.5625

Interpreting the coefficient: For each additional fair chance job per 1000 Indeed posts, the proportion of individuals with a filed W2 one year after a felony conviction decreases by 0.004087.

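The p-value for avgFair (0.388) is greater than 0.05, so the relationship is not statistically significant. With only 13 states in the model, we do not have evidence that the rate of fair chance postings on Indeed explains the proportion of W2s filed; this is not the same as showing that there is no relationship.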

 # 2017 felony/Indeed data with the pledge index columns attached
allMeasures <- left_join(indeedCJARS, pledge_index)

# Regression of the W2 filing proportion on both the fair chance posting
# rate and the pledge index
model2 <- lm(formula = fe_any_w2_y1 ~ avgFair + index, data = allMeasures)

summary(model2)
## 
## Call:
## lm(formula = fe_any_w2_y1 ~ avgFair + index, data = allMeasures)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.05826 -0.02841  0.00258  0.01428  0.09347 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.550207   0.064989   8.466 7.15e-06 ***
## avgFair     -0.001657   0.004218  -0.393   0.7028    
## index       -0.169248   0.085108  -1.989   0.0748 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04832 on 10 degrees of freedom
## Multiple R-squared:  0.3324, Adjusted R-squared:  0.1989 
## F-statistic:  2.49 on 2 and 10 DF,  p-value: 0.1326

Coefficients:

For each additional fair chance job per 1000 Indeed posts, the proportion of individuals with a filed W2 one year after a felony conviction decreases by approximately 0.001657.

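For each one-unit increase in the pledge index, the proportion of individuals with a filed W2 one year after a felony conviction decreases by approximately 0.169248, holding avgFair constant. Because the index is the share of a state's top businesses that have signed the fair chance pledge, a one-unit increase means going from none of them to all of them having signed; it does not correspond to one additional business.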

This doesn’t really make sense…

# State-level policy index data
policy_stuff <- read_csv(file = "policy_stuff2.csv")

# Fair chance posting rate against the average policy index, with each state
# drawn as a text label and a fitted straight line
ggplot(policy_stuff, aes(x = Average_Index, y = avgFair, label = state)) +
  geom_text() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Indeed & Average Index",
    x = "Average Index",
    y = "Fair Chance Job postings per 1,000 postings"
  ) +
  theme_minimal()


# Fair chance posting rate against the negative policy index, with each state
# drawn as a text label and a fitted straight line
ggplot(policy_stuff, aes(x = Negative_Index, y = avgFair, label = state)) +
  geom_text() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Indeed & Negative Index",
    x = "Negative Index",
    y = "Fair Chance Job postings per 1,000 postings"
  ) +
  theme_minimal()


# policy_stuff %>%
#   ggplot(aes(x = avgFair, y = Value_1, label= state))+
#   geom_text()
# 
# policy_stuff %>%
#   ggplot(aes(x = avgFair, y = Value_5, label= state, color = Value_4))+
#   geom_text()
# 2017 felony outcomes with the policy columns attached; every 2017 felony
# row is kept whether or not it has a match in the policy data
policyCJARS <- fe_data %>%
  filter(year == 2017) %>%
  left_join(policy_stuff)

# W2 filing proportion after a felony charge against the negative policy
# index, with each state drawn as a text label and a fitted straight line.
# The title names the felony W2 outcome because that is what the y-axis
# shows; the previous "Indeed & ..." title was carried over from the Indeed
# plots and did not describe this figure.
policyCJARS %>%
  ggplot(aes(y = fe_any_w2_y1, x = Negative_Index, label = state)) +
  geom_text() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Negative Index",
    y = "Proportion of Individuals with W2 Filed",
    title = "Felony W2 Filing & Negative Index"
  ) +
  theme_minimal()


# W2 filing proportion after a felony charge against the average policy
# index, with each state drawn as a text label and a fitted straight line.
# The title names the felony W2 outcome because that is what the y-axis
# shows; the previous "Indeed & ..." title was carried over from the Indeed
# plots and did not describe this figure.
policyCJARS %>%
  ggplot(aes(y = fe_any_w2_y1, x = Average_Index, label = state)) +
  geom_text() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Average Index",
    y = "Proportion of Individuals with W2 Filed",
    title = "Felony W2 Filing & Average Index"
  ) +
  theme_minimal()


# policyCJARS %>%
#   ggplot(aes(y = fe_any_w2_y1, x = Value_5, label= state, color = Value_4))+
#   geom_text() +
#   geom_smooth(method = "lm")